\[ \require{mhchem} \]
library(tidyverse)
library(chron)
source("functions_extra.R")
Define options
Sys.setlocale("LC_TIME","C") # English month/weekday names regardless of system locale (needed for "Sat"/"Sun" checks below)
options(stringsAsFactors=FALSE) # do not auto-convert character columns to factors on import
options(chron.year.abb=FALSE) # chron: print full 4-digit years rather than 2-digit abbreviations
theme_set(theme_bw()) # just my preference for plots
Based on past work, we can define a function that reads in the data and additionally provides several time variables.
R provides many functions for extraction of time information, but for atmospheric applications we often classify time periods according to season (which is not provided). We will define our own function to convert month to season:
Month2Season <- function(month) {
  ## Convert calendar month (integer 1-12) to meteorological season.
  ## Returns a factor with levels "DJF", "MAM", "JJA", "SON".
  season.levels <- c("DJF", "MAM", "JJA", "SON")
  ## Group months in blocks of three, wrapping December around to winter:
  ## Dec/Jan/Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
  idx <- (month %/% 3) %% 4 + 1
  factor(season.levels[idx], season.levels)
}
Test this new function:
Month2Season(c(1, 3, 12))
## [1] DJF MAM DJF
## Levels: DJF MAM JJA SON
Next, we define the function for importing the time series:
ReadTSeries <- function(filename, timecolumn="datetime", timeformat="%d.%m.%Y %H:%M") {
  ## Import a semicolon-delimited time-series file (5 header lines skipped)
  ## and augment it with calendar fields derived from the timestamp column.
  d <- read.table(filename, skip=5, header=TRUE, sep=";", check.names=FALSE)
  ## strip trailing units from column headers, then standardize the name of
  ## the time column
  names(d) <- sub("[ ].*$", "", names(d))
  names(d) <- sub("Date/time", timecolumn, names(d), fixed=TRUE)
  ## parse the character timestamps into chron objects; shift each
  ## end-of-interval stamp back one hour so it marks the interval start
  timestamps <- as.chron(d[[timecolumn]], timeformat) - 1/24
  d[[timecolumn]] <- timestamps
  ## derive additional time variables for grouping/plotting
  d[["year"]] <- years(timestamps)
  d[["month"]] <- months(timestamps)
  d[["day"]] <- days(timestamps)
  d[["hour"]] <- hours(timestamps)
  d[["dayofwk"]] <- weekdays(timestamps)
  d[["daytype"]] <- ifelse(d[["dayofwk"]] %in% c("Sat", "Sun"), "Weekend", "Weekday")
  d[["season"]] <- Month2Season(unclass(d[["month"]]))
  ## return the augmented data frame
  d
}
Read and merge (with full_join) Lausanne (LAU) and Zürich (ZUE) data:
## Read each site's file, tag its rows with a `site` column, and merge the
## two tables on all shared columns (full_join keeps rows from either site).
datapath <- file.path("data", "2013")
df <- full_join(cbind(site="LAU", ReadTSeries(file.path(datapath, "LAU.csv"))),
cbind(site="ZUE", ReadTSeries(file.path(datapath, "ZUE.csv"))))
## Joining, by = c("site", "datetime", "O3", "NO2", "CO", "PM10", "TEMP", "PREC", "RAD", "year", "month", "day", "hour", "dayofwk", "daytype", "season")
We can see that this data frame contains data from both sites.
head(df)
## site datetime O3 NO2 CO PM10 TEMP PREC RAD year month day
## 1 LAU (12/31/2012 00:00:00) 7.8 56.3 0.5 16.1 3.8 0 -2.4 2012 Dec 31
## 2 LAU (12/31/2012 01:00:00) 22.4 38.0 0.4 11.6 4.1 0 -2.3 2012 Dec 31
## 3 LAU (12/31/2012 02:00:00) 14.5 37.2 0.3 10.3 3.1 0 -2.1 2012 Dec 31
## 4 LAU (12/31/2012 03:00:00) 28.7 25.4 0.3 10.5 3.5 0 -2.2 2012 Dec 31
## 5 LAU (12/31/2012 04:00:00) 19.6 33.7 0.3 9.0 2.9 0 -2.2 2012 Dec 31
## 6 LAU (12/31/2012 05:00:00) 30.8 51.2 0.3 8.7 3.2 0 -2.3 2012 Dec 31
## hour dayofwk daytype season SO2 NMVOC EC
## 1 0 Mon Weekday DJF NA NA NA
## 2 1 Mon Weekday DJF NA NA NA
## 3 2 Mon Weekday DJF NA NA NA
## 4 3 Mon Weekday DJF NA NA NA
## 5 4 Mon Weekday DJF NA NA NA
## 6 5 Mon Weekday DJF NA NA NA
tail(df)
## site datetime O3 NO2 CO PM10 TEMP PREC RAD year month
## 17563 ZUE (12/31/2013 18:00:00) 1.4 49.1 0.6 28.5 1.0 0 -2.7 2013 Dec
## 17564 ZUE (12/31/2013 19:00:00) 1.5 47.9 0.6 31.9 0.2 0 -2.4 2013 Dec
## 17565 ZUE (12/31/2013 20:00:00) 2.0 48.6 0.7 34.1 -0.1 0 -2.5 2013 Dec
## 17566 ZUE (12/31/2013 21:00:00) 2.5 48.4 0.7 40.7 0.0 0 -2.4 2013 Dec
## 17567 ZUE (12/31/2013 22:00:00) 2.2 43.0 0.6 48.5 -0.4 0 -2.5 2013 Dec
## 17568 ZUE (12/31/2013 23:00:00) 2.4 43.5 0.6 53.3 -0.7 0 -2.5 2013 Dec
## day hour dayofwk daytype season SO2 NMVOC EC
## 17563 31 18 Tue Weekday DJF 3.4 0.2 2.4
## 17564 31 19 Tue Weekday DJF 3.4 0.2 2.4
## 17565 31 20 Tue Weekday DJF 3.3 0.2 2.6
## 17566 31 21 Tue Weekday DJF 4.2 0.2 2.6
## 17567 31 22 Tue Weekday DJF 3.6 0.2 2.5
## 17568 31 23 Tue Weekday DJF 4.4 0.2 2.5
Let us save this data frame for later.
saveRDS(df, "data/2013/lau-zue.rds")
Elongate the data frame, as before.
lf <- df %>%
  ## Reshape wide -> long: one row per (site, time, variable) measurement.
  ## pivot_longer() supersedes gather() (tidyr >= 1.0); the content is the
  ## same, though rows come out ordered by original row rather than by column.
  pivot_longer(!c(site, datetime, season, year, month, day, hour, dayofwk, daytype),
               names_to = "variable", values_to = "value")
Plotting your data is very good practice. Check for general trends and extreme values.
View all the measurements:
ggplot(lf)+ # `lf` is the data frame
facet_grid(variable~site, scales="free_y")+ # panels; `scales` spelled out (was partially matched as `scale`)
geom_line(aes(datetime, value, color=site))+ # plot `value` vs. `time` as lines
scale_x_chron()+ # format x-axis labels (time units)
theme(axis.text.x=element_text(angle=30, hjust=1)) # rotate x-axis labels
In the following figures, we will summarize the measurements using non-parametric (order) statistics, which we will cover in a subsequent lecture.
Here we will use ggplot to compute and display statistical summaries. A box and whisker plot displays the 25th, 50th, and 75th percentiles of the data using a box, and 1.5 times the interquartile range (75th minus 25th percentile interval) using whiskers that extend beyond the box. Points which lie outside this range are denoted by individual symbols. Calling geom_boxplot will combine computation and display of these summaries for each categorical variable used for paneling (faceting) and grouping (along the x-axis in the following examples).
Display summary by month:
ggplot(lf) +
  ## `scales` spelled out in full (the original `scale=` relied on partial
  ## argument matching)
  facet_grid(variable ~ site, scales = "free_y") +
  geom_boxplot(aes(month, value), outlier.size = 0.5, outlier.shape = 3) # small crosses mark outliers
By day type and season:
lf %>%
  filter(site=="LAU" & !is.na(value)) %>% # Lausanne only, drop missing values
  ggplot +
  ## `scales` spelled out in full (avoid partial argument matching)
  facet_grid(variable ~ season, scales = "free_y") +
  geom_boxplot(aes(daytype, value), outlier.size = 0.5, outlier.shape = 3)
Note that we have piped the data frame to the
ggplot function (but note that the notation changes from %>% when manipulating data to + when combining plotting elements).
The precipitation may be better viewed in this instance as a cumulative sum or daily average (to compare across weekdays and weekends, which have different numbers of days).
lf %>%
  filter(site=="LAU" & !is.na(value) & variable=="PREC") %>% # precipitation at Lausanne only
  ggplot +
  ## `scales` spelled out in full (avoid partial argument matching)
  facet_grid(variable ~ season, scales = "free_y") +
  geom_bar(aes(daytype, value), stat="summary", fun="mean", show.legend = FALSE) + # bar height = mean over observations
  scale_y_continuous("Daily mean precipitation (mm)", expand=expansion(mult=c(0, 0.1)))
The following function returns a function to be used for calculation of error bars.
Percentile <- function(perc) {
  ## Function factory for ggplot's fun.min/fun.max summary arguments:
  ## `perc` is a percentile on the 0-100 scale; the returned function
  ## computes that percentile of a numeric vector `x`, ignoring NAs.
  function(x) quantile(x, perc*1e-2, na.rm=TRUE)
}
Here we will again use ggplot to compute and display a set of statistical summaries in a different way. We specify both the data and mapping within ggplot, and use geom_line to display the computed medians and geom_errorbar to display the computed 25th and 75th percentiles.
Diurnal (hourly) variations in pollutant concentrations at Lausanne site:
lf %>%
  filter(site=="LAU" & !is.na(value)) %>%
  ggplot(aes(x=hour, y=value, group=daytype, color=daytype)) +
  ## `scales` spelled out in full (avoid partial argument matching)
  facet_grid(variable ~ season, scales = "free_y", drop=TRUE) +
  geom_line(stat="summary", fun="median")+ # hourly median
  geom_errorbar(stat="summary", # hourly 25th-75th percentile range
                fun.min=Percentile(25),
                fun.max=Percentile(75))+
  ggtitle("LAU")
Diurnal variations in \(\ce{O3}\) concentrations:
## O3 diurnal cycle: median and 25th-75th percentile range per hour,
## split by season (columns) and site (rows); y-scale shared across rows
lf %>%
filter(variable=="O3") %>%
ggplot(aes(x=hour, y=value, group=daytype, color=daytype)) +
facet_grid(site ~ season, drop=TRUE) +
geom_line(stat="summary", fun="median")+
geom_errorbar(stat="summary",
fun.min=Percentile(25),
fun.max=Percentile(75))+
ggtitle("O3")
Note that for concentrations of the same pollutant, we fix the y-scale to be the same for both rows.
Diurnal variations in \(\ce{NO2}\) concentrations:
## NO2 diurnal cycle by day of week (columns) and season (rows),
## comparing the two sites by color
lf %>%
filter(variable=="NO2") %>%
ggplot(aes(x=hour, y=value, group=site, color=site)) +
facet_grid(season ~ dayofwk, drop=TRUE) +
geom_line(stat="summary", fun="median")+
geom_errorbar(stat="summary",
fun.min=Percentile(25),
fun.max=Percentile(75))+
ggtitle("NO2")
Why are concentrations in Lausanne higher? (hint: check location of monitoring equipment)
Ozone can be quenched near \(\ce{NO_x}\) emission sources by the titration of \(\ce{O3}\) by \(\ce{NO}\). The sum of oxidants (\(\ce{O_x}\) = \(\ce{O3}\) + \(\ce{NO2}\)) is a conserved quantity in the “NO titration” reaction (\(\ce{NO + O3 -> NO2 + O2}\)), and is sometimes examined alongside \(\ce{O_3}\).
Ox <- lf %>%
  ## summer (JJA) ozone and NO2 only
  filter(variable %in% c("O3", "NO2") & season=="JJA") %>%
  ## one column per pollutant; pivot_wider() supersedes spread()
  pivot_wider(names_from = variable, values_from = value) %>%
  ## total oxidant: Ox = O3 + NO2 (conserved under NO titration)
  mutate(Ox = O3 + NO2) %>%
  select(-NO2) %>%
  ## back to long format with O3 and Ox as the plotted variables;
  ## pivot_longer() supersedes gather()
  pivot_longer(c(O3, Ox), names_to = "variable", values_to = "value")
## diurnal median and 25th-75th percentile range for O3 and Ox,
## by site (rows) and day type (columns)
Ox %>%
ggplot(aes(x=hour, y=value, group=variable, color=variable)) +
facet_grid(site ~ daytype, drop=TRUE) +
geom_line(stat="summary", fun="median")+
geom_errorbar(stat="summary",
fun.min=Percentile(25),
fun.max=Percentile(75))
We can also view the relative contribution of \(\ce{NO2}\) to \(\ce{O_x}\) to understand the diurnal variation in \(\ce{O_x}\).
Ox %>%
  ## one column per variable so the NO2 fraction can be computed;
  ## pivot_wider() supersedes spread()
  pivot_wider(names_from = variable, values_from = value) %>%
  mutate(ratio = 1 - O3/Ox) %>% # NO2 contribution to total oxidant
  ggplot(aes(x=hour, y=ratio)) +
  facet_grid(site ~ daytype, drop=TRUE) +
  geom_line(stat="summary", fun="median")+
  geom_errorbar(stat="summary",
                fun.min=Percentile(25),
                fun.max=Percentile(75)) +
  scale_y_continuous(expression(NO[2]/O[x]), limits=c(0, 1))
We can more generally summarize extreme values that exceed the daily limit values set forth by regulation in Switzerland.
## daily limit values per pollutant, set by Swiss regulation (see text above);
## used as a lookup table for reference lines and exceedance screening below
limits.daily <- data.frame(value=c(100,80,8,50),
variable=c("SO2","NO2","CO","PM10"))
Let us compute the daily means, but also note the percent recovery of data. Sometimes, measurements are not available for all periods for different reasons - e.g., there was an instrument malfunction, or because the instrument was taken offline for calibration. If the values used to compute the “daily mean” do not constitute a full day, then how representative is this mean? Taking into consideration such irregularities in data that violate assumptions of the statistics you will compute is part of a broader task known as “data cleaning”.
In the syntax below, note the use of group_by (split) and summarize (aggregate) operations (the results are automatically combined) for computing summary statistics (percent.recovery and the mean value). The ungroup is optional but allows us to specify other grouping variables in the future (otherwise, the grouping variables would remain fixed for the data table, daily).
daily <- lf %>%
  filter(variable %in% limits.daily[["variable"]]) %>% # regulated pollutants only
  mutate(date = dates(datetime)) %>% # calendar date of each hourly record
  group_by(site, date, variable) %>%
  summarize(percent.recovery = length(na.omit(value))/length(value)*1e2, # % of records with data
            value = mean(value, na.rm=TRUE), # daily mean of available values
            ## drop grouping explicitly: replaces a trailing ungroup() and
            ## silences the "summarise() has grouped output" message
            .groups = "drop")
## `summarise()` has grouped output by 'site', 'date'. You can override using the `.groups` argument.
The selection of threshold is often arbitrary, but a threshold recovery of 75% or 80% is typically used in practice to claim a valid mean. We will use 75%:
threshold <- 75
Let us see how many days the data recovery is at or below this threshold for each variable:
## tally days with insufficient data recovery, per site and variable
daily %>%
filter(percent.recovery < threshold) %>%
count(site, variable)
## # A tibble: 3 x 3
## site variable n
## <chr> <chr> <int>
## 1 LAU PM10 6
## 2 LAU SO2 366
## 3 ZUE PM10 8
Lausanne does not have an SO2 monitor, so this makes sense, and we are only missing a few days of PM\(_{10}\) measurements at each site. We can see which dates we do not have adequate data recovery:
filter(daily, percent.recovery < threshold & variable=="PM10")
## # A tibble: 14 x 5
## site date variable percent.recovery value
## <chr> <dates> <chr> <dbl> <dbl>
## 1 LAU 06/30/2013 PM10 62.5 20.6
## 2 LAU 07/01/2013 PM10 37.5 22.1
## 3 LAU 07/02/2013 PM10 29.2 16.5
## 4 LAU 10/24/2013 PM10 41.7 11.1
## 5 LAU 10/25/2013 PM10 50 29.4
## 6 LAU 11/14/2013 PM10 0 NaN
## 7 ZUE 05/30/2013 PM10 58.3 8.76
## 8 ZUE 05/31/2013 PM10 50 2.89
## 9 ZUE 06/07/2013 PM10 29.2 15.7
## 10 ZUE 06/08/2013 PM10 0 NaN
## 11 ZUE 06/09/2013 PM10 0 NaN
## 12 ZUE 06/10/2013 PM10 0 NaN
## 13 ZUE 06/11/2013 PM10 45.8 11.9
## 14 ZUE 11/14/2013 PM10 0 NaN
What can you do when you have such missing values? One approach is to remove means computed for dates with less than the required threshold of data recovery. Another approach used in statistics is called imputation, whereby missing values are populated with best estimates for its value (e.g., by interpolation or by drawing from a well-characterized distribution). For this exercise, we will simply take the first approach.
Let us visualize the time series with limit values indicated for each variable:
daily %>%
  filter(percent.recovery >= threshold) %>% # valid daily means only
  ggplot+
  ## `scales` spelled out in full (avoid partial argument matching)
  facet_grid(variable~site, scales="free_y")+
  geom_line(aes(x=date, y=value))+
  geom_hline(data=limits.daily, mapping=aes(yintercept=value), linetype=2)+ # regulatory limit per panel row
  scale_x_chron(format="%d.%m")+
  theme(axis.text.x=element_text(angle=30, hjust=1))
Note that in the command above, the data used for drawing horizontal lines are specified separately for the
geom_hline function.
We can also view exceedances through empirical cumulative distribution functions (ECDF) of concentrations (to be covered in a later lecture):
daily %>%
  filter(percent.recovery >= threshold) %>% # valid daily means only
  ggplot+
  ## `scales` spelled out in full (avoid partial argument matching)
  facet_grid(variable~site, scales="free_y")+
  geom_line(aes(x=value), stat="ecdf")+ # empirical CDF of daily means
  geom_point(aes(x=value), stat="ecdf")+
  geom_vline(data=limits.daily, mapping=aes(xintercept=value), linetype=2) # regulatory limit per panel row
To select which days exceed the limit values, we will use a “lookup table” in which the limit value can be referred to by its key (variable name). The following statement creates a named vector where limit values (value) is labeled by the pollutant (variable):
(limits.vec <- with(limits.daily, setNames(value, variable)))
## SO2 NO2 CO PM10
## 100 80 8 50
We can then use this vector (lookup table) to select the dates which exceed the limit values for each variable:
## keep valid daily means that exceed the limit for their pollutant
## (limits.vec is indexed by variable name as a lookup table)
exceedances <- daily %>%
filter(percent.recovery >= threshold &
value > limits.vec[as.character(variable)])
Let us view this data table:
head(exceedances)
## # A tibble: 6 x 5
## site date variable percent.recovery value
## <chr> <dates> <chr> <dbl> <dbl>
## 1 LAU 02/14/2013 PM10 100 50.8
## 2 LAU 02/26/2013 PM10 100 72.0
## 3 LAU 02/27/2013 PM10 100 65.8
## 4 LAU 02/28/2013 PM10 100 71.3
## 5 LAU 03/01/2013 PM10 100 68.9
## 6 LAU 03/03/2013 PM10 100 55.8
tail(exceedances)
## # A tibble: 6 x 5
## site date variable percent.recovery value
## <chr> <dates> <chr> <dbl> <dbl>
## 1 ZUE 03/27/2013 PM10 100 52.0
## 2 ZUE 03/28/2013 PM10 100 60.8
## 3 ZUE 04/05/2013 PM10 100 56.0
## 4 ZUE 04/06/2013 PM10 100 54.3
## 5 ZUE 12/17/2013 NO2 100 87.8
## 6 ZUE 12/18/2013 NO2 100 90.0
If we want a summary of the number of exceedances, count serves the purpose of group_by and summarize with a single function:
## number of exceedance days per site and pollutant
exceedances %>%
count(site, variable)
## # A tibble: 4 x 3
## site variable n
## <chr> <chr> <int>
## 1 LAU NO2 1
## 2 LAU PM10 16
## 3 ZUE NO2 4
## 4 ZUE PM10 8
If we want the summary in monthly resolution, we can make a simple modification to the code above:
## exceedance counts at monthly resolution
exceedances %>%
mutate(month = months(date)) %>% # chron months(): ordered factor Jan-Dec
count(site, variable, month)
## # A tibble: 9 x 4
## site variable month n
## <chr> <chr> <ord> <int>
## 1 LAU NO2 Mar 1
## 2 LAU PM10 Feb 4
## 3 LAU PM10 Mar 8
## 4 LAU PM10 Apr 4
## 5 ZUE NO2 Mar 2
## 6 ZUE NO2 Dec 2
## 7 ZUE PM10 Feb 2
## 8 ZUE PM10 Mar 4
## 9 ZUE PM10 Apr 2
We can export this table with the following command:
write.csv2(exceedances, file="exceedances.csv", row.names=FALSE)
Note that write.csv2 uses the European convention for comma-separated-value files, where the delimiter is actually a semicolon (;) rather than comma (,).